require(GGally, quietly = TRUE)
require(reshape2, quietly = TRUE)
require(tidyverse, quietly = TRUE, warn.conflicts = FALSE)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
library(ggfortify)
library(cluster)
library(ggdendro)
library(broom)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(readr)

theme_set(theme_bw())
personagens = read_csv(file = "../dados/film-dialogue/character_list5.csv")
## Parsed with column specification:
## cols(
##   script_id = col_integer(),
##   imdb_character_name = col_character(),
##   words = col_integer(),
##   gender = col_character(),
##   age = col_character()
## )
personagens = personagens %>%
  filter(age != 'NULL') %>% 
  mutate(age = as.numeric(age))

filmes = read.csv(file = "../dados/film-dialogue/meta_data7.csv")
filmes = filmes %>%
  filter(gross != 'NA')

filmes_personagens = merge(filmes, personagens, by="script_id")

mulheres = filmes_personagens %>%
  filter(gender == 'f') %>%
  group_by(script_id, imdb_id, title, year, gross) %>%
  summarise(n_f=n(), words_f=median(words))

homens = filmes_personagens %>%
  filter(gender == 'm') %>%
  group_by(script_id, imdb_id, title, year, gross) %>%
  summarise(n_m=n(), words_m=median(words))

dados = merge(mulheres, homens, 
                           by=c('script_id','imdb_id','title','year','gross'))
duplicados = dados %>%
  group_by(title) %>% filter(row_number() > 1)

dados = dados %>% 
  filter(!(title %in% duplicados$title))
  
dados = dados %>%
  subset(select = -c(script_id,imdb_id,year,gross))
dw = dados

summary(dw)
##                         title           n_f            words_f      
##  (500) Days of Summer      :   1   Min.   : 1.000   Min.   : 101.0  
##  10 Things I Hate About You:   1   1st Qu.: 2.000   1st Qu.: 340.6  
##  12 Years a Slave          :   1   Median : 3.000   Median : 584.5  
##  127 Hours                 :   1   Mean   : 2.957   Mean   : 827.3  
##  1408                      :   1   3rd Qu.: 4.000   3rd Qu.:1017.9  
##  1492: Conquest of Paradise:   1   Max.   :14.000   Max.   :7664.0  
##  (Other)                   :1572                                    
##       n_m            words_m      
##  Min.   : 1.000   Min.   : 114.0  
##  1st Qu.: 4.000   1st Qu.: 350.0  
##  Median : 6.000   Median : 538.0  
##  Mean   : 6.525   Mean   : 759.4  
##  3rd Qu.: 8.000   3rd Qu.: 903.0  
##  Max.   :23.000   Max.   :5716.0  
## 
dw %>% 
    select(-title) %>% 
    ggpairs()

# Escala de log 
dw2 <- dw %>% 
    mutate_each(funs(log), 2:5)

dw2 %>% 
    select(-title) %>% 
    ggpairs()

summary(select(dw2, -title))
##       n_f            words_f           n_m           words_m     
##  Min.   :0.0000   Min.   :4.615   Min.   :0.000   Min.   :4.736  
##  1st Qu.:0.6931   1st Qu.:5.831   1st Qu.:1.386   1st Qu.:5.858  
##  Median :1.0986   Median :6.371   Median :1.792   Median :6.288  
##  Mean   :0.9166   Mean   :6.392   Mean   :1.740   Mean   :6.364  
##  3rd Qu.:1.3863   3rd Qu.:6.925   3rd Qu.:2.079   3rd Qu.:6.806  
##  Max.   :2.6391   Max.   :8.944   Max.   :3.135   Max.   :8.651
dw2.scaled = dw2 %>% 
  mutate_each(funs(as.vector(scale(.))), 2:5)

summary(dw2.scaled)
##                         title           n_f             words_f        
##  (500) Days of Summer      :   1   Min.   :-1.5573   Min.   :-2.26001  
##  10 Things I Hate About You:   1   1st Qu.:-0.3796   1st Qu.:-0.71409  
##  12 Years a Slave          :   1   Median : 0.3093   Median :-0.02742  
##  127 Hours                 :   1   Mean   : 0.0000   Mean   : 0.00000  
##  1408                      :   1   3rd Qu.: 0.7981   3rd Qu.: 0.67800  
##  1492: Conquest of Paradise:   1   Max.   : 2.9267   Max.   : 3.24527  
##  (Other)                   :1572                                       
##       n_m              words_m       
##  Min.   :-3.14947   Min.   :-2.3466  
##  1st Qu.:-0.64030   1st Qu.:-0.7295  
##  Median : 0.09358   Median :-0.1097  
##  Mean   : 0.00000   Mean   : 0.0000  
##  3rd Qu.: 0.61428   3rd Qu.: 0.6369  
##  Max.   : 2.52572   Max.   : 3.2971  
## 
dw2.scaled %>% 
    select(-title) %>% 
    ggpairs()

dists = dw2.scaled %>%
      column_to_rownames("title") %>% 
    dist(method = "euclidean")

hc = hclust(dists, method = "ward.D")

plot(hc, cex = .6)

plot(hc, hang = -1)

n_clusters = 4
rect.hclust(hc, k=n_clusters)

dw2 <- dw2 %>% 
    mutate(cluster = hc %>% 
               cutree(k = n_clusters) %>% 
               as.character())

dw2.scaled <- dw2.scaled %>% 
    mutate(cluster = hc %>% 
               cutree(k = n_clusters) %>% 
               as.character())

dw2.long = melt(dw2.scaled, id.vars = c("title", "cluster"))

hc %>% 
    cutree(k = n_clusters) %>% 
    silhouette(dists) %>% 
    plot(col = RColorBrewer::brewer.pal(n_clusters, "Set2"))

dw2.long %>% 
    ggplot(aes(x = variable, y = value, group = title, colour = cluster)) + 
    geom_line(alpha = 0.4) + 
    facet_wrap(~ cluster) 

p <- dw2 %>%
    plot_ly(type = 'parcoords',
            line = list(color = ~cluster),
            dimensions = list(
                #list(range = c(1, 4), label = "cluster", values = ~cluster),
                list(range = c(0, 4),
                     label = 'n_f', values = ~n_f),
                list(range = c(0, 4),
                     constraintrange = c(5,6),
                     label = 'words_f', values = ~words_f),
                list(range = c(0, 4),
                     label = 'n_m', values = ~n_m),
                list(range = c(0, 4),
                     label = 'words_m', values = ~words_m)
            )
    )
p

k-means

dw2.scaled = dw2.scaled %>% 
    select(-cluster) # Remove o cluster adicionado antes lá em cima via hclust

# O agrupamento de fato:
km = dw2.scaled %>% 
    select(-title) %>% 
    kmeans(centers = n_clusters, nstart = 20)

# O df em formato longo, para visualização 
dw2.scaled.km.long = km %>% 
    augment(dw2.scaled) %>% # Adiciona o resultado de km 
                            # aos dados originais dw2.scaled em 
                            # uma variável chamada .cluster
    gather(key = "variável", 
           value = "valor", 
           -title, -.cluster) # = move para long todas as 
                                            # variávies menos repository_language 
                                            # e .cluster
dw2.scaled.km.long %>% 
    ggplot(aes(x = `variável`, y = valor, group = title, colour = .cluster)) + 
    #geom_point(alpha = 0.2) + 
    geom_line(alpha = .5) + 
    facet_wrap(~ .cluster) 

autoplot(km, data = dw2.scaled, label = TRUE)

dists = dw2.scaled %>% 
    select(-title) %>% 
    dist() # só para plotar silhouetas depois
plot(silhouette(km$cluster, dists), col = RColorBrewer::brewer.pal(n_clusters, "Set2"))

#não funciona...
#table(km %>% pull(cluster))
#km %>% pull(cluster) %>% table()
#summary(dw2.scaled)
p <- km %>% 
    augment(dw2.scaled) %>%
    plot_ly(type = 'parcoords',
            line = list(color = ~.cluster, 
                        showScale = TRUE),
            dimensions = list(
                #list(range = c(1, 4), label = "cluster", values = ~cluster),
                list(range = c(-3, 3),
                     label = 'n_f', values = ~n_f),
                list(range = c(-3, 3),
                     label = 'words_f', values = ~words_f),
                list(range = c(-6, 3),
                     label = 'n_m', values = ~n_m),
                list(range = c(-2, 3),
                     label = 'words_m', values = ~words_m)
            )
    )
p

Qual seria um bom valor de k? Uma medida comumente usada no kmeans é comparar a distância (quadrática) entre o centro dos clusters e o centro dos dados com a distância (quadrática) entre os pontos todos nos dados e o centro dos dados. Aqui o centro dos dados é um ponto imaginário na média de todas as variáveis. Calculamos a distância do centro de cada cluster para o centro dos dados e multiplicamos pelo número de pontos nesse cluster. Somando esse valor para todos os clusters, temos betweenss abaixo. Se esse valor for próximo do somatório total das distâncias dos pontos para o centro dos dados (totss), os pontos estão próximos do centro de seu cluster. Essa proporção pode ser usada para definir um bom valor de k. Quando ela para de crescer, para de valer à pena aumentar k.

set.seed(123)
explorando_k = tibble(k = 1:15) %>% 
    group_by(k) %>% 
    do(
        kmeans(select(dw2.scaled, -title), 
               centers = .$k, 
               nstart = 20) %>% glance()
    )

explorando_k %>% 
    ggplot(aes(x = k, y = betweenss / totss)) + 
    geom_line() + 
    geom_point()